home *** CD-ROM | disk | FTP | other *** search
- #
- # SiteDB.py
- # JunkMatcher
- #
- # Created by Benjamin Han on 2/1/05.
- # Copyright (c) 2005 Benjamin Han. All rights reserved.
- #
-
- # This program is free software; you can redistribute it and/or
- # modify it under the terms of the GNU General Public License
- # as published by the Free Software Foundation; either version 2
- # of the License, or (at your option) any later version.
-
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
-
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- #!/usr/bin/env python
-
- # IMPORTANT: SiteDB can have only a single instance!
-
- import datetime, copy, string
-
- from consts import *
- from utilities import *
- from rwlock import * # to ensure thread-safety, since we will have only one global SiteDB
-
-
- _gTLD=sets.Set(openFile('%setc/gTLD' % ENGINE_PATH).read().split('\n')[:-1])
- _gTLD.add('arpa') # to be 100% precise arpa is not a generic TLD
- _ccTLD=sets.Set(openFile('%setc/ccTLD' % ENGINE_PATH).read().split('\n')[:-1])
-
- # SiteDB is implemented to accomodate multiple readers and (xor) single writer
- _siteDBRWLock = RWLock()
-
-
- class SiteDB (dict):
- """Bad sites collection: a tree
- ----------------------------
- fn: the filename
- count: the # of unique sites (or leaf nodes)
- sizeLimit: the max capacity (count will be kept less than sizeLimit; 0 with no limit)
-
- Note: each node is a list [time, count, level, children]
- - time: # of seconds since epoch in UTC when the last time the node is updated
- - count: # of occurrences
- - level: depth in the tree (0 is the top-level, i.e., TLD)
- - children: a dictionary or None (if it's a leaf)
- """
- def __init__ (self, fn, sizeLimit = 0):
- self.fn = fn
- self.sizeLimit = sizeLimit
-
- self.load()
-
- def __str__ (self):
- """Returns a string representation of the SiteDB; can be used with loadFromString()
- later."""
- _siteDBRWLock.acquire_read()
- ret = '\n'.join(['%s\n%u %d' % ('.'.join(siteInfo[0]), siteInfo[1], siteInfo[2])
- for siteInfo in self._iter()])
- _siteDBRWLock.release()
-
- return ret
-
- def _load (self, s):
- """(INTERNAL) Load from a string 's' - NOT THREAD-SAFE!"""
- self.clear()
- self._count = 0
-
- isName = True
- for l in filter(lambda l:len(l), map(string.strip, s.split('\n'))):
- if isName:
- site = l.split('.')
- isName = False
- else:
- l = l.split(' ')
- self._addOne(site, int(l[0]), int(l[1]), False)
- isName = True
-
- if self.sizeLimit and self._count >= self.sizeLimit:
- self._prune()
-
- def load (self):
- # NOTE: MULTIPLE THREADS MAY LOAD THE DB MULTIPLE TIMES!
- _siteDBRWLock.acquire_write()
-
- try: f = openFile(self.fn)
- except:
- return
-
- s = f.read()
- self._load(s)
-
- _siteDBRWLock.release()
-
- def loadFromString (self, s):
- # NOTE: MULTIPLE THREADS MAY LOAD THE DB MULTIPLE TIMES!
- _siteDBRWLock.acquire_write()
-
- try:
- self._load(s)
- except Exception, e:
- _siteDBRWLock.release()
- raise e
-
- _siteDBRWLock.release()
-
- def size (self):
- _siteDBRWLock.acquire_read()
- ret = self._count
- _siteDBRWLock.release()
- return ret
-
- def setSizeLimit (self, sizeLimit):
- """Set size limit of the SiteDB; returns True iff pruning occurs (and something did
- get removed during the pruning)."""
- _siteDBRWLock.acquire_write()
- ret = False
-
- try:
- if self.sizeLimit > sizeLimit:
- self.sizeLimit = sizeLimit
- oldSize = self._count
- self._prune()
- ret = (self._count != oldSize)
-
- else: self.sizeLimit = sizeLimit
-
- except Exception, e:
- printException(u'Exception in SiteDB.setSizeLimit()', e)
-
- _siteDBRWLock.release()
-
- return ret
-
- def _addOne (self, site, timeSec, numTimes, prune):
- """Add one site (a sequence of name components) to the tree - the site can be numerical
- or non-numerical, and only significant parts are kept: for non-numerical names with
- _gTLD, it's the last 2 components; for non-numerical names with _ccTLD, it's the last
- 3 components; for numerical names it's the 4 components.
-
- Pruning happens when necessary, if prune is set to True.
-
- ASSUMPTION: site has at least two components.
- """
-
- tld = site[-1]
- level = 0
- theDict = self
-
- isGTLD = tld in _gTLD
- isCCTLD = tld in _ccTLD
-
- if not isGTLD and not isCCTLD:
- try:
- int(tld)
- except:
- # bogus TLD
- return
-
- # numerical names - add from the first component forward
- for comp in site:
- if theDict is None:
- theDict = {}
- node[-1] = theDict
-
- if level < 3:
- node = theDict.setdefault(comp, [0, 0, level, {}])
- theDict = node[-1]
- level += 1
- else:
- node = theDict.setdefault(comp, [0, 0, level, None])
- break
-
- if level != 3:
- # bogus IP address (must have 4 components)
- return
-
- else:
- if isGTLD: count = 1
- else: count = 2
-
- # non-numerical names - add from the last component backward
- for comp in site[::-1]:
- if theDict is None:
- theDict = {}
- node[-1] = theDict
- if level == count:
- node = theDict.setdefault(comp, [0, 0, level, None])
- break
- else:
- node = theDict.setdefault(comp, [0, 0, level, {}])
- theDict = node[-1]
- level += 1
-
- node[1] += numTimes
- nodeTime = node[0]
-
- if nodeTime == 0:
- self._count += 1 # this is the first time the site is added
- node[0] = timeSec
- if prune and self.sizeLimit and self._count > self.sizeLimit:
- self._prune()
-
- elif nodeTime < timeSec:
- node[0] = timeSec # update to the latest time
-
- def addOne (self, site, timeSec, numTimes = 1, prune = True):
- """A thread-safe version of _addOne()."""
- _siteDBRWLock.acquire_write()
- ret = self._addOne(site, timeSec, numTimes, prune)
- _siteDBRWLock.release()
-
- return ret
-
- def getOne (self, site):
- """Try to add a site (a sequence of name components) into the collection; returns a
- tuple (l, n, b), where l is a list of not-yet-added but significant name components,
- n is the deepest node we can find, and b is a boolean:
-
- 1. l is None if the last component is bogus (not one of the valid TLDs)
- 2. n is None if the last component is not in the collection (in that case l is
- the full list of name components)
- 3. b is True iff site is *not* numerical
-
- Examples:
- 1. sites.getOne(('yahoo','com')) -> ((), n, True), where n is not None: 'yahoo.com' is
- in the collection;
- 2. sites.getOne(('www','yahoo','com')) -> ((), n, True), where n is not None: same as
- above;
- 3. sites.getOne(('cnet','com')) -> (('cnet'), n, True), where n is not None: there is
- already at least one .com site added, but cnet.com is not added before;
- 4. sites.getOne(('cnet','com')) -> (('cnet','com'), None, True): no .com site is added
- before;
- 5. sites.getOne(('rubbish','bogus')) -> (None, None, True); the last component is bogus.
- 6. sites.getOne(('128','2','3','4')) -> (('2','3','4'), None, False); at least one IP
- starting with 128 has been added before.
- """
-
- tld = site[-1]
- theDict = self
- prevNode = None
-
- if tld in _gTLD:
- count = 2
- elif tld in _ccTLD:
- count = 3
- else:
- try:
- int(tld)
- count = 0
- except:
- return None, None, True
-
- _siteDBRWLock.acquire_read()
-
- if count > 0:
- # non-numerical name
- for comp in site[::-1]:
- node = theDict.get(comp)
- if node is None: break
- else:
- count -= 1
- prevNode = node
- theDict = node[-1]
- if theDict is None: break
-
- if prevNode is None:
- _siteDBRWLock.release()
- return site[-count:], None, True
- else:
- endPos = -(prevNode[2] + 1)
- _siteDBRWLock.release()
- return site[endPos-count:endPos], prevNode, True
-
- else:
- # numerical name
- for comp in site:
- try: int(comp)
- except:
- _siteDBRWLock.release()
- return None, None, True
-
- node = theDict.get(comp)
- if node is None: break
- else:
- count += 1
- prevNode = node
- theDict = node[-1]
- if theDict is None: break
-
- _siteDBRWLock.release()
- return site[count:4], prevNode, False
-
- def addOneToNode (self, nameComps, node, notNumerical, timeSec, numTimes = 1):
- """Add one site with prefix/suffix in nameComps (a sequence; prefix if notNumerical is
- True, suffix otherwise) to node, so that in effect the complete name of the site is
- '.'.join(nameComps)+suffix (non-numeric), or prefix+'.'.join(nameComps) (numeric),
- where suffix/prefix is the name components collected from the root node to 'node'.
-
- Pruning happens when necessary.
-
- ASSUMPTION: nameComps contains only the significant components (based on what
- the TLD is) - this function is intended to be used with getOne().
-
- TO-DO: if any exception occurs _siteDBRWLock might still be locked in write mode!
- """
- if len(nameComps):
- level = node[2] + 1
- theDict = node[-1]
-
- if notNumerical:
- nameComps.reverse()
-
- _siteDBRWLock.acquire_write()
-
- for comp in nameComps[:-1]:
- if theDict is None:
- theDict = {}
- node[-1] = theDict
- node = theDict.setdefault(comp, [0, 0, level, {}])
- theDict = node[-1]
- level += 1
- if theDict is None:
- theDict = {}
- node[-1] = theDict
- node = theDict.setdefault(nameComps[-1], [0, 0, level, None])
-
- else: _siteDBRWLock.acquire_write()
-
- node[1] += numTimes
- nodeTime = node[0]
- if nodeTime == 0:
- self._count += 1
- node[0] = timeSec
- if self.sizeLimit and self._count > self.sizeLimit:
- self._prune()
- elif nodeTime < timeSec:
- node[0] = timeSec
-
- _siteDBRWLock.release()
-
- def _removeOne (self, site, countMatters = False):
- """Removes a site (a sequence of name components) from the collection: if countMatters
- is True, decrements the count by 1 (and removes the entire entry if count becomes 0);
- if not, removes the entire entry; in both cases the timestamp will not be updated.
-
- Returns True iff change did occur.
-
- ASSUMPTION: site is syntactically correct (no int(.) test on each component of an
- IP address).
-
- WARNING: NOT thread-safe - use removeOne() for that!
- """
-
- tld = site[-1]
- theDict = self
- path = []
-
- if tld in _gTLD:
- count = 2
- elif tld in _ccTLD:
- count = 3
- else:
- try:
- int(tld)
- count = 0
- except: return False
-
- # find a path from the leaf to the root
- if count > 0:
- for comp in site[::-1]:
- node = theDict.get(comp)
- if node is None:
- return False
- else:
- path.append([comp, node])
- count -= 1
- if count == 0: break
- theDict = node[-1]
- if theDict is None:
- return False
-
- else:
- for comp in site:
- node = theDict.get(comp)
- if node is None:
- return False
- else:
- path.append([comp, node])
- count += 1
- if count == 4: break
- theDict = node[-1]
- if theDict is None:
- return False
-
- # remove the path, from bottom up
- if countMatters:
- key, node = path[-1]
-
- # decrement the count
- if node[1]:
- node[1] -= 1
- else:
- # not a leaf node - don't do anything
- return False
-
- if node[1] == 0:
- # count is decremented to zero for this node, remove it
- self._count -= 1
- for nextKey, node in path[-2::-1]:
- theDict = node[-1]
- del theDict[key]
-
- # if this node has other children, or itself represents a site
- # don't try to remove it from its parent's dictionary
- if len(theDict) or node[1]: break
-
- key = nextKey
- else:
- theDict = self
- del theDict[key]
-
- else:
- self._count -= 1
- key = path[-1][0]
- for nextKey, node in path[-2::-1]:
- theDict = node[-1]
- del theDict[key]
-
- # if this node has other children, or itself represents a site
- # don't try to remove it from its parent's dictionary
- if len(theDict) or node[1]: break
-
- key = nextKey
- else:
- theDict = self
- del theDict[key]
-
- return True
-
- def removeOne (self, site, countMatters = False):
- """A thread-safe version of _removeOne()."""
- _siteDBRWLock.acquire_write()
-
- try:
- ret = self._removeOne(site, countMatters)
- except Exception, e:
- printException(u'Exception in SiteDB.removeOne().', e)
-
- _siteDBRWLock.release()
-
- return ret
-
- def _prune (self):
- """Recency-based pruning: throw away the oldest n records; NOT thread-safe (use prune()
- for that)."""
- # n is the # of entries we want to prune
- n = self._count - int(self.sizeLimit * SAFE_SITE_SIZE_RATIO)
- if n <= 0: return
-
- l = [[[siteInfo[1], siteInfo[2]], copy.copy(siteInfo[0])]
- for siteInfo in self._iter()]
- l.sort() # sorted by time, then # of occurrence, then names
- for i in l[:n]: self._removeOne(i[1])
-
- def prune (self):
- """A thread-safe version of _prune()."""
- _siteDBRWLock.acquire_write()
-
- try:
- self._prune()
- except Exception, e:
- printException(u'Exception in SiteDB.prune().', e)
-
- _siteDBRWLock.release()
-
- def recycle (self):
- _siteDBRWLock.acquire_write()
-
- try:
- self.clear()
- self._count = 0
- os.remove(self.fn)
-
- except Exception, e:
- printException(u'Exception in SiteDB.recycle()', e)
-
- _siteDBRWLock.release()
-
- def _iter (self):
- """Iterates over the tree. Returns a list [site name, time, # of occurrences], where
- site name is a list of name components, time is # of seconds since epoch when the last
- time the site is updated.
-
- WARNING: NOT THREAD-SAFE!
- """
- stack = [[k,v] for k, v in self.items()]
- nameComps = []
- while len(stack):
- node = stack[0]
- del stack[0]
-
- payload = node[-1]
- level = payload[2]
-
- if level == 0: del nameComps[:]
- elif level < len(nameComps): del nameComps[0:-level]
- nameComps.insert(0, node[0])
-
- if payload[1]:
- # it's a site
- try:
- int(nameComps[-1])
- yield [nameComps[::-1], payload[0], payload[1]]
- except:
- # non-numerical name
- yield [nameComps,payload[0], payload[1]]
-
- if payload[-1] is not None:
- # not a leaf node - push the children
- for k,v in payload[-1].items()[::-1]:
- stack.insert(0,[k,v])
-
- def removeSitesUsingPattern (self, sitesPattern):
- """Removes any site that matches sitesPattern (a regex pattern); returns True iff
- there's any change."""
- removeList = []
-
- _siteDBRWLock.acquire_read()
- for siteInfo in self._iter():
- siteName = '.'.join(siteInfo[0])
- if sitesPattern.search(siteName): removeList.append(siteName)
- _siteDBRWLock.release()
-
- # it's possible some sites are already gone by now (since it's not locked here)
- # but it's okay - removeOne() is tolerant enough
- if len(removeList):
- _siteDBRWLock.acquire_write()
-
- try:
- noChange = True
- for site in removeList:
- if self._removeOne(site.split('.')) and noChange:
- noChange = False
- except Exception, e:
- printException(u'Exception in SiteDB.removeSitesUsingPattern()', e)
-
- _siteDBRWLock.release()
- return not noChange
-
- return False
-
- def writeToFile (self, fn = None):
- if fn: f = openFile(fn, 'w')
- else: f = openFile(self.fn, 'w')
- f.write(self.__str__())
-
- def loadIntoList (self, siteList):
- """Load into siteList - a list of dict with keys 'name', 'time', and 'count'."""
- _siteDBRWLock.acquire_read()
-
- tmpList = map(lambda siteInfo: (siteInfo[1],
- {'name':'.'.join(siteInfo[0]),
- 'time':datetime.datetime.utcfromtimestamp(siteInfo[1]),
- 'count':siteInfo[2]}),
- self._iter())
-
- tmpList.sort()
- tmpList.reverse()
- siteList[:] = map(lambda i:i[1], tmpList)
-
- _siteDBRWLock.release()
-
-
- if __name__ == '__main__':
- siteDB = SiteDB('%ssiteDB' % CONF_PATH, 2000)
-
- l, n, b = siteDB.getOne(('xyz', 'com'))
- print l, n is not None, b
- l, n, b = siteDB.getOne(('xyz', 'abc', 'biz'))
- print l, n is not None, b
- l, n, b = siteDB.getOne(('xyz', 'abc', 'com', 'tw'))
- print l, n is not None, b
- l, n, b = siteDB.getOne(('xyz', 'abc', 'com', 'il'))
- print l, n is not None, b
- l, n, b = siteDB.getOne(('com', )) # so don't pass this to SiteDB!
- print l, n is not None, b
- l, n, b = siteDB.getOne(('rubbish', 'bogus'))
- print l, n is not None, b
- l, n, b = siteDB.getOne(('66', '0', '78', '20'))
- print l, n is not None, b
-
- print
- print '* before removal:', siteDB.size()
- print '* any "biz" site removed?', siteDB.removeSitesUsingPattern(re.compile(r'\.biz$'))
- print '* any "edu" site removed?', siteDB.removeSitesUsingPattern(re.compile(r'\.edu$'))
- print '* after removal:', siteDB.size()
- print
-
- siteDB.removeOne('24info.com.tw'.split('.'))
- s = str(siteDB)
- siteDB.loadFromString(s)
-
- # test writing the content to a file
- #siteDB.writeToFile('sites.out')
- #print '* Wrote %d sites to "sites.out".' % siteDB.size()
-
- #siteList = []
- #siteDB.loadIntoList(siteList)
- #print '\n'.join(['%s (%d): %s' % (s['name'], s['count'], s['time'])
- # for s in siteList])
-